library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr)
library(ggrepel)

humandf <- read_tsv("human_biomart.txt")
## Rows: 67128 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene stable ID, Chromosome/scaffold name
## dbl (2): Gene start (bp), Gene end (bp)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
zfishdf <- read_tsv("zfish_biomart.txt")
## Rows: 37241 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene stable ID, Chromosome/scaffold name
## dbl (2): Gene start (bp), Gene end (bp)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rbhdf <- read_tsv("Human_Zebrafish_RBH.tsv")
## Rows: 5476 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Human Gene ID, Human Protein ID, Human Gene Name, Zebrafish Gene ID...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(humandf)[1] <- 'Human Gene ID' 
names(humandf)[2] <- 'Human Chrom' 
names(humandf)[3] <- 'Human Gene Start' 
names(humandf)[4] <- 'Human Gene End' 

names(zfishdf)[1] <- 'Zebrafish Gene ID'
names(zfishdf)[2] <- 'Zebrafish Chrom'
names(zfishdf)[3] <- 'Zebrafish Gene Start'
names(zfishdf)[4] <- 'Zebrafish Gene End'


merged <- left_join(left_join(rbhdf, zfishdf), humandf)
## Joining, by = "Zebrafish Gene ID"
## Joining, by = "Human Gene ID"
a <- table(merged$`Zebrafish Chrom`)
b <- table(merged$`Human Chrom`)


zfish_chrom_count <- a[order(-as.numeric(names(a)), decreasing = TRUE)]
## Warning in order(-as.numeric(names(a)), decreasing = TRUE): NAs introduced by
## coercion
human_chrom_count <- b[order(-as.numeric(names(b)), decreasing = TRUE)]
## Warning in order(-as.numeric(names(b)), decreasing = TRUE): NAs introduced by
## coercion
zfish_chrom_count <- as.data.frame(zfish_chrom_count[1:25])
human_chrom_count <- as.data.frame(human_chrom_count[1:22])
names(zfish_chrom_count)[1] <- 'Zebrafish Chromosome Number'
names(human_chrom_count)[1] <- 'Human Chromosome Number'
print(zfish_chrom_count)
##    Zebrafish Chromosome Number Freq
## 1                            1  253
## 2                            2  264
## 3                            3  236
## 4                            4  195
## 5                            5  326
## 6                            6  251
## 7                            7  272
## 8                            8  223
## 9                            9  224
## 10                          10  186
## 11                          11  187
## 12                          12  198
## 13                          13  242
## 14                          14  172
## 15                          15  163
## 16                          16  207
## 17                          17  225
## 18                          18  197
## 19                          19  175
## 20                          20  265
## 21                          21  216
## 22                          22  136
## 23                          23  180
## 24                          24  163
## 25                          25  194
print(human_chrom_count)
##    Human Chromosome Number Freq
## 1                        1  550
## 2                        2  433
## 3                        3  354
## 4                        4  236
## 5                        5  279
## 6                        6  276
## 7                        7  273
## 8                        8  175
## 9                        9  220
## 10                      10  255
## 11                      11  296
## 12                      12  305
## 13                      13   83
## 14                      14  177
## 15                      15  192
## 16                      16  245
## 17                      17  313
## 18                      18   84
## 19                      19  228
## 20                      20  150
## 21                      21   48
## 22                      22  130
n_tbl <- tibble(zfish_start = NA,   
                  human_chrom = NA,
                  zfish_chrom = NA)

for (i in seq(1:25)){
  index <- which(merged$`Zebrafish Chrom` == i)
  for (x in index) {
    n_tbl%>%
      add_row(zfish_start = merged[x, ]$`Zebrafish Gene Start`,
              human_chrom = merged[x, ]$`Human Chrom`,
              zfish_chrom = merged[x, ]$`Zebrafish Chrom`)->n_tbl
  }
}

grouped_n_tbl <- n_tbl %>%
  group_by(zfish_chrom)
  
ordered_n_tbl <- grouped_n_tbl[order(as.numeric(grouped_n_tbl$"zfish_chrom")), ]

d_list <- group_split(ordered_n_tbl)

for (i in 1:25) {
  
  a <- tibble(zfish_start <- d_list[[i]][,1],
              human_chrom <- d_list[[i]][,2])
  plot_name <- as.character(d_list[[i]][,3][1,])
      
  a <- a[!str_detect(a$human_chrom, '([A-Za-z])'), ]
  a <- a[!str_detect(a$human_chrom, '\\.$'), ]
  a$human_chrom <- unlist(as.numeric(a$human_chrom))
  a$zfish_start <- unlist(as.numeric(a$zfish_start))
  a <- a[order(a$human_chrom),]
  title1 <- paste("\nPutative Orthologs Between Zebrafish Chromosome",i)
  title2 <- paste(title1, "and All Human Chromosomes\n")
  
  print( ggplot(a, aes(x = factor(zfish_start), y = factor(human_chrom), fill=factor(human_chrom))) +
                            geom_dotplot(binaxis = "y", 
                                         stackdir = "centerwhole", 
                                         binwidth = 0.38, color=NA) +
                            scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
                            theme(axis.text.x = element_text(size=7, angle = 90, hjust = 1), 
                                  legend.position = "none",
                                  plot.title = element_text(size = 10, face = "bold")) +
                            xlab("\nGene Start Positions As Basepair #") +
                            ylab("Human Chromosome #") +
                            ggtitle(title2))
    }